import  time,pymysql,pandas,re
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from datetime import datetime
from requests_html import HTMLSession   
# Shared requests-html session; used by save_detail() for plain HTTP fetches.
session = HTMLSession()
# MySQL connection settings.
# NOTE(review): credentials are hard-coded in source — consider moving them
# to environment variables or a config file.
settings = { 'host':'172.16.10.201','user':'zy001','port':3306,'password':'zy@123','database':'crawl-bk','charset' : 'utf8'}
conn = pymysql.connect(**settings)
# Module-level cursor shared by every function below.
cursor = conn.cursor()
# Chrome options for the Selenium driver.
option = ChromeOptions()

# Preference block that would disable image loading (2 = block); currently
# inactive because the add_experimental_option('prefs', ...) call below is
# commented out.
prefs = {
        'profile.default_content_setting_values': {
            'images': 2
        }
    }
# option.headless = True
# option.add_experimental_option('prefs', prefs)
# Hide the "Chrome is being controlled by automated software" banner to
# reduce the chance of bot detection.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--ignore-certificate-errors')




# First listing-page URL per Kunming district on km.zu.fang.com.
# The trailing 'z62/' segment is a listing filter — TODO confirm its meaning.
area_first_url = {
    '盘龙':'https://km.zu.fang.com/house-a016831/z62/',
    '五花':'https://km.zu.fang.com/house-a016832/z62/',
    '官渡':'https://km.zu.fang.com/house-a016830/z62/',
    '西山':'https://km.zu.fang.com/house-a016833/z62/',
    '安宁':'https://km.zu.fang.com/house-a01069/z62/',
    '宜良':'https://km.zu.fang.com/house-a011163/z62/',
    '呈贡':'https://km.zu.fang.com/house-a01071/z62/',
}


# List-page data scraping
def save_list(area_first_url, driver, redict_url):
    """Crawl up to 100 listing pages per district and store title/link rows.

    Args:
        area_first_url: mapping of district name -> first listing-page URL
            (must end with 'z62/'; page N is derived as .../i3N-z62/).
        driver: Selenium Chrome driver used for all page loads.
        redict_url: URL prefix the site redirects to when it demands
            verification; used to detect blocked requests.

    Side effects: inserts listing rows into fangtianxia_rent_all_total and a
    log row per page into crawl_log via the module-level cursor/conn.
    """
    for region, first_url in area_first_url.items():
        driver.get(first_url)
        # BUGFIX: the original reused `i` for both the page counter and the
        # per-listing element loop; distinct names avoid the shadowing.
        for page_no in range(1, 101):
            page = str(page_no)
            # Page N lives at .../i3N-z62/; page 1 is the bare district URL.
            url = first_url[:-4] + 'i3' + page + '-z62/'
            if page_no == 1:
                url = first_url
            driver.get(url)
            # If we were redirected to the verification page, retry the
            # district URL until the block clears.
            while redict_url in driver.current_url:
                driver.get(first_url)
                time.sleep(10)
            rows = []
            for item in driver.find_elements_by_xpath("//div[@class='houseList']/dl/dd"):
                # Single lookup of the anchor instead of two round-trips.
                anchor = item.find_element_by_xpath("./p/a")
                rows.append((anchor.text, '住宅', url, anchor.get_attribute('href'), region, datetime.now()))
            sql = "insert into fangtianxia_rent_all_total(title,types,list_url,url,region,create_time) values(%s,%s,%s,%s,%s,%s)"
            cursor.executemany(sql, rows)
            conn.commit()
            # Crawl-log entry recording completion of this page.
            s = "insert into crawl_log(tables,url,described,create_time) values(%s,%s,%s,%s)"
            tables = 'fangtianxia_rent_all_total'
            described = '完成第'+page+'页列表数据的抓取'
            row = (tables, url, described, datetime.now())
            cursor.execute(s, row)
            conn.commit()



   

def find_params(url, text):
    """Extract request parameters embedded in a fang.com page's text.

    With a truthy *url*: pull the ``rfss`` token out of *text* (the run of
    characters after ``rfss=`` up to, but excluding, the first single quote)
    and return ``url + '?rfss=' + token``.

    With a falsy *url*: return the literal ``houseInfo`` object found in
    *text* (everything after ``houseInfo = `` up to the first ``;``).

    Raises ValueError (from str.index) when the marker is absent.
    """
    if url:
        start = text.index('rfss=') + len('rfss=')
        token = text[start:].split("'", 1)[0]
        return url + '?rfss=' + token
    start = text.index('houseInfo = ') + len('houseInfo = ')
    return text[start:].split(';', 1)[0]



def change(res):
    """Quote bare object keys in *res* so it parses as a Python dict literal,
    then repair the hh:mm:ss timestamp values the quoting pass mangles.
    """
    # Repairs for time values broken by the key-quoting pass, e.g.
    # '"12":"30":45"' -> '12:30:45"'; the millisecond form goes first.
    repairs = (
        (r'"(\d{2})":"(\d{2})":(\d{2}\.\d{3})"', r'\1:\2:\3"'),
        (r'"(\d{2})":"(\d{2})":(\d{2})"', r'\1:\2:\3"'),
    )
    out = re.sub(r'(\w*?):', r'"\1":', res)
    for pattern, replacement in repairs:
        out = re.sub(pattern, replacement, out)
    return out

       
def test():
    """Debug/scratch helper: opens one detail page in Chrome and fetches a
    batch of stored URLs, then discards everything and returns None.

    NOTE(review): this looks like leftover experimentation — the first `url`
    is immediately overwritten, the driver is never quit (browser process
    leaks), `json`/`session`/`data` are unused, and `a = 0` does nothing.
    Candidate for deletion once confirmed unused.
    """
    url = 'https://km.zu.fang.com/chuzu/3_164242618_1.htm'  # overwritten below
    driver = webdriver.Chrome(executable_path='chromedriver', options=option)
    url = 'https://km.zu.fang.com/chuzu/3_164147262_1.htm'
    driver.get(url)

    # Shadows the module-level session/imports; harmless but redundant.
    import json
    from requests_html import HTMLSession   
    session = HTMLSession()

    sql = 'select url from fangtianxia_rent_all_total where id >48'
    cursor.execute(sql)
    data = cursor.fetchall()
    a = 0
   
      

def save_detail(session, url):
    """Fetch one rental detail page and update its row in
    fangtianxia_rent_houst.

    Args:
        session: requests_html HTMLSession used for both HTTP fetches.
        url: detail-page URL as stored by save_list (no query params yet).

    Returns a short status string: "存储失败" on a non-200 response,
    '重定向到了列表' when the houseInfo blob is missing (the site bounced us
    back to a listing page), '成功·' on success.
    """
    original_url = url
    # First fetch: scrape the rfss token the site expects as a query param.
    req = session.get(url)
    text = req.html.text
    url = find_params(url, text)
    print(url)
    # Second fetch: the actual detail page, now carrying ?rfss=...
    req = session.get(url)
    if req.status_code != 200:
        return "存储失败"
    try:
        data = find_params(text=req.html.text, url=None).replace(' ', '')
    except Exception:
        # find_params raises ValueError when 'houseInfo' is absent, i.e. the
        # site redirected us back to a listing page.
        return '重定向到了列表'
    # HACK: eval() on scraped page content — change() makes it dict-shaped,
    # but this still executes remote text; consider ast.literal_eval/json.
    dic = eval(change(data))
    dic['create_detail_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # BUGFIX: values are now bound as query parameters instead of being
    # concatenated into the SQL string (injection via scraped content).
    # Column names still come from the page's own keys — TODO whitelist them.
    assignments = ','.join(k + '=%s' for k in dic)
    sql = 'update fangtianxia_rent_houst SET ' + assignments + ' where url = %s'
    cursor.execute(sql, list(dic.values()) + [original_url])
    conn.commit()
    return '成功·'


if __name__ == '__main__':
    # Verification redirect URL used by save_list to detect blocked requests.
    # redict_url = 'http://search.fang.com/'
    # driver = webdriver.Chrome(executable_path='chromedriver', options=option)
    # List storage:
    # save_list(area_first_url,driver,redict_url)

    # Detail storage: re-crawl every row whose detail fields are still empty.
    sql = 'select url from fangtianxia_rent_houst where isNorth is null '
    cursor.execute(sql)
    data = cursor.fetchall()
    # NOTE(review): skips the first two rows — looks like a manual resume
    # point from a previous run; confirm before relying on it.
    for record in data[2:]:
        print(record[0])
        try:
            save_detail(session, url=record[0])
        except Exception as exc:
            # Best-effort crawl: log the failure (the original bare
            # `except: pass` hid it entirely), roll back the aborted
            # transaction, and continue with the next URL.
            print(exc)
            conn.rollback()

# BUGFIX: close the cursor before its connection (original order reversed).
cursor.close()
conn.close()
